## 2016年厦门大数据大赛
## 第二题:基于大数据的商品销售预测及关联销售挖掘
## author:Daitu
## 2016-7-20
## 工作:读取预处理后的数据进行探索分析;
## 分析top500的数据有什么趋势等,探索性可视化分析
## Set the working directory ------------------------------------
## The data paths used by the load()/write calls in this script are
## relative, so they are resolved against this directory.
## The location is machine-specific, so it is only applied when it exists;
## otherwise the script keeps running from the current directory instead of
## aborting on its first statement.
work_dir <- "/Users/daitu/数据分析/2016ABD"
if (dir.exists(work_dir)) {
  setwd(work_dir)
} else {
  warning("Working directory not found, staying in ", getwd(), ": ", work_dir,
          call. = FALSE)
}
getwd()
## [1] "/Users/daitu/数据分析/2016ABD"
## 加载所需要的包-----------------------------------
library(VIM)
## Warning: package 'VIM' was built under R version 3.2.5
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.4
library(GGally)
## Warning: package 'GGally' was built under R version 3.2.5
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.2.4
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(treemap)
## Warning: package 'treemap' was built under R version 3.2.4
library(d3treeR)
library(readr)
## Load the data set ####--------------------------------------------
## Records of the top-500 best-selling items; the .RData file is expected
## to restore an object named `top500df` into the workspace.
load(file = "第二题数据/top500df.RData")
## Size of the table as (rows, columns)
dim(top500df)
## [1] 8964 12
## Number of records for each value of `day`
table(top500df[["day"]])
##
## 1 4 5 6 17 18 19 20 21 22 23 24 25 26 27 28 29 30
## 498 498 498 498 498 498 498 498 498 498 498 498 498 498 498 498 498 498
## Wrap the data frame as a dplyr tbl and add a `time` key built from the
## year, month and day columns, formatted as "year/month/day".
top500df <- tbl_df(top500df) %>%
  mutate(time = paste(year, month, day, sep = "/"))
## Count the distinct values of the two identifying columns
n_distinct(top500df[["brand_name"]]) ## 209 brands in total
## [1] 209
n_distinct(top500df[["item_number"]]) ## 949 distinct items appear
## [1] 949
## Collapse the records to one row per brand / item / date / shop type,
## averaging every measure column within each combination.
top500gu <- summarise(
  dplyr::group_by(top500df, brand_name, item_number, time, shop_type),
  assessment_num = mean(assessment_num),
  monthly_sales_num = mean(monthly_sales_num),
  shop_num = mean(shop_num),
  avg_price = mean(avg_price),
  monthly_sales = mean(monthly_sales),
  like_num = mean(like_num)
)
## Exploratory analysis of missing values ####--------------------------------
VIM::aggr(top500gu, prop = TRUE)

## Findings recorded from the plot above:
## 1: roughly 10% of the shop_type values are missing, and the matching
##    shop_num values are all 0. The missing shop_type values are mostly
##    concentrated on days 18, 19 and 20. Both columns can be filled from
##    neighbouring observations, which keeps the risk of errors low.
## 2: avg_price and monthly_sales are missing in roughly 15% of the rows,
##    and always together.
## A shop_num of 0 next to a missing shop_type is really a missing value,
## so mark it as NA first.
top500gu$shop_num[is.na(top500gu$shop_type)] <- NA
## Fill the gaps in shop_type (column 4) and shop_num (column 7).
## Each column is filled on its own: running apply() over both columns
## would first coerce them into a single character matrix and silently turn
## shop_num into text. na.rm = FALSE keeps any leading NA in place so the
## filled vectors always have the same length as the table.
## NOTE(review): the notes above talk about filling from the values that
## come *after* a gap, but na.locf() carries the previous observation
## forward by default. The direction is left unchanged here; pass
## fromLast = TRUE if backward filling is what is actually wanted.
top500gu$shop_type <- zoo::na.locf(top500gu$shop_type, na.rm = FALSE)
top500gu$shop_num <- zoo::na.locf(top500gu$shop_num, na.rm = FALSE)
## Re-check the missing-value pattern
VIM::aggr(top500gu, prop = TRUE)

## Only avg_price and monthly_sales still contain missing values, and they
## are missing for an item as a whole, which makes imputation practically
## impossible. Those rows are therefore dropped.
top500gu <- dplyr::filter(top500gu, !is.na(avg_price))
dim(top500gu)
## [1] 7743 10
## Re-check the missing-value pattern
VIM::aggr(top500gu)

## 可以发现已经不存在缺失值了
## Visual exploration of the top-500 data ####------------------------------
## Derived column: sales volume = monthly sales amount / average price,
## rounded to a whole number; rows are then sorted by monthly sales amount.
top500gu <- arrange(
  mutate(top500gu, number = round(monthly_sales / avg_price)),
  desc(monthly_sales)
)
## Compare the sales volume recorded in the raw data (x axis) with the
## volume derived from monthly_sales / avg_price (y axis).
ggplot(data = top500gu, aes(x = monthly_sales_num, y = number)) +
  theme_grey(base_family = "STKaiti") +
  geom_point(aes(colour = shop_type), size = 1) +
  theme(legend.position = "top") +
  ## Reference line y = x: points on it have identical recorded and derived
  ## volumes. Passing only `intercept = 1` would draw y = x + 1 instead.
  geom_abline(slope = 1, intercept = 0, size = 0.2) +
  labs(x = "原始月销售量", y = "计算月销售量", title = "月销售量检查")

## 从上面的图像可以看出两个销量并不完全一样的,但是总体趋势还是在一条直线上的
## 并且:(1):TB_TMALL类型的店铺的月销售量更高,
## (2) :TB_JISHI的销量普遍偏低的
## 还是无法验证数据中的销量数据是否正确,假设销量数据是正确的
top500gu[["number"]] <- NULL # drop the derived sales-volume column again
# Box plots of monthly sales volume for a random sample of 50 items
item <- sample(unique(top500gu[["item_number"]]), size = 50) # 50 items drawn from all items
ggplot(top500gu[top500gu[["item_number"]] %in% item, ],
       aes(x = item_number, y = monthly_sales_num, fill = shop_type)) +
  geom_boxplot() +
  theme_bw(base_family = "STKaiti") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = "商品", y = "月销量", title = "商品销量盒形图")

## 可以查看商品销量的差别
# Box plots of monthly sales volume for a random sample of 50 brands
brand <- sample(unique(top500gu[["brand_name"]]), size = 50)
ggplot(top500gu[top500gu[["brand_name"]] %in% brand, ],
       aes(x = brand_name, y = monthly_sales_num, fill = shop_type)) +
  geom_boxplot() +
  theme_bw(base_family = "STKaiti") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = "品牌", y = "月销量", title = "品牌销量盒形图")

## 跨度较大,是由于不同的商品引起的
## Aggregate per item and visualise #### ----------------------------
## Coerce shop_num to numeric so that it can be averaged below
top500gu$shop_num <- as.numeric(top500gu$shop_num)
## One row per item / brand / shop type, holding the mean of every measure.
top500_item <- top500gu %>%
  dplyr::group_by(item_number, brand_name, shop_type) %>% # keys that identify one product
  summarise(assessment_num = mean(assessment_num),
            monthly_sales_num = mean(monthly_sales_num),
            shop_num = mean(shop_num),
            avg_price = mean(avg_price),
            monthly_sales = mean(monthly_sales),
            like_num = mean(like_num)) %>%
  ## summarise() removes only the last grouping level, so the result is
  ## still grouped. Drop the grouping so that the sort below orders the
  ## whole table by sales volume rather than being affected by the groups.
  ## NOTE: the head() output recorded after this block was captured before
  ## ungroup() was added and is ordered by the grouping keys instead.
  ungroup() %>%
  arrange(desc(monthly_sales_num))
dim(top500_item)
## [1] 902 9
as.data.frame(head(top500_item))
## item_number brand_name shop_type assessment_num monthly_sales_num
## 1 0 Nike TB_JISHI 132.07692 9220.538
## 2 000 Nike TB_JISHI 447.83333 2712.667
## 3 000000 Nike TB_JISHI 316.50000 2370.750
## 4 000001 Nike TB_JISHI 35.15385 8120.231
## 5 0001 顺诚 TB_JISHI 1486.09091 2580.091
## 6 0001 Adidas TB_TMALL 71.00000 5024.000
## shop_num avg_price monthly_sales like_num
## 1 48.692308 1891.92239 2587.188 352.84615
## 2 58.666667 424.65776 52833.667 2227.16667
## 3 45.125000 805.39487 38400.167 1886.25000
## 4 3.384615 1353.88462 21.000 44.07692
## 5 6.636364 37.64329 92865.455 739.45455
## 6 1.000000 1.00000 5024.000 2.00000
## 902 products have appeared in the top 500
## Parallel-coordinate plots ---------------------------------------
## Axis labels for the six measure columns (column 4 to the last column)
lab_x <- c("评价数目","月销量","销售店铺数","单价","月销售额","喜欢数目")
## p1: ggparcoord with its default scaling, one line colour per shop type
p1 <- ggparcoord(top500_item, columns = 4:ncol(top500_item),
                 groupColumn = "shop_type") +
  theme_gray(base_family = "STKaiti") +
  theme(legend.position = "top") +
  labs(x = "", y = "标准化数值", title = "top500平行坐标图") +
  scale_x_discrete(labels = lab_x)
## p2: same plot with scale = "uniminmax"
p2 <- ggparcoord(top500_item, columns = 4:ncol(top500_item),
                 groupColumn = "shop_type", scale = "uniminmax") +
  theme_gray(base_family = "STKaiti") +
  theme(legend.position = "top") +
  labs(x = "", y = "单位区间数值", title = "top500平行坐标图") +
  scale_x_discrete(labels = lab_x)
## Stack the two plots vertically
grid.arrange(p1, p2, nrow = 2)

##
## Scatter-plot matrix ----------------------------------
## Pairwise scatter plots of the measure columns (column 4 onwards),
## coloured by shop type, with Pearson correlations.
ggscatmat(data = as.data.frame(top500_item),
          columns = 4:ncol(top500_item),
          color = "shop_type",
          corMethod = "pearson") +
  theme_bw(base_family = "STKaiti") +
  theme(legend.position = "top") +
  labs(title = "散点图矩阵")

## Item sales amount vs. unit price ####-----------------
## Tiles are nested brand > item; tile size comes from monthly_sales and
## tile colour from avg_price.
t1 <- treemap(
  top500_item,
  index = c("brand_name", "item_number"),
  vSize = "monthly_sales",
  vColor = "avg_price",
  type = "value",
  title = "产品的月销售额",
  title.legend = "平均售价",
  fontfamily.title = "STKaiti",
  fontfamily.labels = "STKaiti",
  fontfamily.legend = "STKaiti"
)
## Interactive version of the same treemap
d3tree(t1, rootname = "品牌~商品号+color = 单价")



## Item sales volume vs. unit price ----------------
## Same layout as the sales-amount treemap, but tile size now comes from
## monthly_sales_num; tile colour is still avg_price.
t1 <- treemap(
  top500_item,
  index = c("brand_name", "item_number"),
  vSize = "monthly_sales_num",
  vColor = "avg_price",
  type = "value",
  title = "商品的月销售量",
  title.legend = "平均售价",
  fontfamily.title = "STKaiti",
  fontfamily.labels = "STKaiti",
  fontfamily.legend = "STKaiti"
)
## Interactive version of the same treemap
d3tree(t1, rootname = "品牌~商品号+color = 单价")



## Aggregate per brand and shop type #### ----------------------------------
## One row per brand / shop type, summarising all of that brand's items.
top500brand <- top500_item %>%
  dplyr::group_by(brand_name, shop_type) %>%
  summarise(item_sum = n_distinct(item_number), # number of best-selling items of the brand
            assessment_sum = sum(assessment_num), # total number of reviews
            monthly_sales_sum = sum(monthly_sales_num), # summed average monthly sales volume
            shop_sum = sum(shop_num), # total number of selling shops
            monthly_sales.sum = sum(monthly_sales), # summed average monthly sales amount
            avg_price_me = mean(avg_price), # mean selling price
            like_num_sum = sum(like_num)) %>% # total number of likes
  ## summarise() removes only the last grouping level, so the result is
  ## still grouped by brand_name. Drop the grouping so that the sort below
  ## orders the whole table by sales amount.
  ## NOTE: the head() output recorded after this block was captured before
  ## ungroup() was added and is ordered by brand_name instead.
  ungroup() %>%
  mutate(brand_type = paste(brand_name, shop_type, sep = "~")) %>%
  arrange(desc(monthly_sales.sum))
dim(top500brand)
## [1] 157 10
as.data.frame(head(top500brand))
## brand_name shop_type item_sum assessment_sum monthly_sales_sum
## 1 0595 TB_JISHI 1 28.64706 2354.588
## 2 1 TB_JISHI 1 2446.00000 3246.500
## 3 139 TB_JISHI 2 3247.58333 4448.917
## 4 1q TB_JISHI 1 433.00000 6817.667
## 5 3515 TB_JISHI 1 20.50000 5507.500
## 6 361 TB_TMALL 29 247196.73414 84062.560
## shop_sum monthly_sales.sum avg_price_me like_num_sum brand_type
## 1 1.764706 35313.471 12.32353 1.941176e+00 0595~TB_JISHI
## 2 1.000000 94148.500 29.00000 1.323750e+03 1~TB_JISHI
## 3 2.000000 126804.250 28.50000 5.023658e+04 139~TB_JISHI
## 4 1.000000 6817.667 1.00000 2.111111e+00 1q~TB_JISHI
## 5 1.000000 5507.500 1.00000 3.500000e+00 3515~TB_JISHI
## 6 166.065126 12119658.562 138.07639 5.056761e+05 361~TB_TMALL
## Number of distinct brands in the per-brand summary
n_distinct(top500brand[["brand_name"]])
## [1] 138
## Treemaps of the average monthly sales amount per brand ####
## Sales amount vs. unit price ---------------------------------
## Tiles are nested shop type > brand; tile size comes from
## monthly_sales.sum and tile colour from avg_price_me.
t1 <- treemap(
  top500brand,
  index = c("shop_type", "brand_name"),
  vSize = "monthly_sales.sum",
  vColor = "avg_price_me",
  type = "value",
  title = "品牌的月销售额",
  title.legend = "平均售价",
  fontfamily.title = "STKaiti",
  fontfamily.labels = "STKaiti",
  fontfamily.legend = "STKaiti"
)
## Two interactive renderings of the same treemap
d3tree(t1, rootname = "品牌~销售方式+color = 单价")
d3tree2(t1, rootname = "品牌~销售方式")



## Sales amount vs. number of likes ---------------------------------
## Tile size comes from monthly_sales.sum and tile colour from
## like_num_sum.
t2 <- treemap(
  top500brand,
  index = c("shop_type", "brand_name"),
  vSize = "monthly_sales.sum",
  vColor = "like_num_sum",
  type = "value",
  title = "品牌的月销售额",
  title.legend = "喜欢数量",
  fontfamily.title = "STKaiti",
  fontfamily.labels = "STKaiti",
  fontfamily.legend = "STKaiti"
)
## Two interactive renderings of the same treemap
d3tree(t2, rootname = "品牌~销售方式+color = 喜欢数量")
d3tree2(t2, rootname = "品牌~销售方式")



## Treemaps of the average monthly sales volume per brand ####
## Sales volume vs. unit price ---------------------------------
## Tile size comes from monthly_sales_sum (volume) and tile colour from
## avg_price_me.
t1 <- treemap(
  top500brand,
  index = c("shop_type", "brand_name"),
  vSize = "monthly_sales_sum",
  vColor = "avg_price_me",
  type = "value",
  title = "品牌的月销售量",
  title.legend = "平均售价",
  fontfamily.title = "STKaiti",
  fontfamily.labels = "STKaiti",
  fontfamily.legend = "STKaiti"
)
## Two interactive renderings of the same treemap
d3tree(t1, rootname = "品牌~销售方式+color = 单价")
d3tree2(t1, rootname = "品牌~销售方式")



## Sales volume vs. number of likes ---------------------------------
## Tile size comes from monthly_sales_sum (sales volume) and tile colour
## from like_num_sum. The chart title names the sales volume so that it
## matches the plotted variable; it previously repeated the sales-amount
## title.
t2 <- treemap(top500brand, index = c("shop_type", "brand_name"),
              vSize = "monthly_sales_sum",
              vColor = "like_num_sum", type = "value",
              fontfamily.title = "STKaiti",
              fontfamily.labels = "STKaiti", fontfamily.legend = "STKaiti",
              title = "品牌的月销售量", title.legend = "喜欢数量")
## Two interactive renderings of the same treemap
d3tree(t2, rootname = "品牌~销售方式+color = 喜欢数量")


d3tree2(t2, rootname = "品牌~销售方式")



## Save the cleaned data for later use ####-----------------------------
## Per item / date table, written with base R as UTF-8.
## NOTE(review): quote = FALSE leaves text fields unquoted, so a brand or
## item value containing a comma would break the column layout — confirm
## the data cannot contain one.
write.csv(top500gu, file = "第二题数据/top500group.csv", fileEncoding = "UTF-8",
          quote = FALSE, row.names = FALSE)
## Per item summary table. The output file is passed positionally because
## the argument was renamed from `path` to `file` in newer readr releases;
## the positional form works with both.
write_csv(top500_item, "第二题数据/top500item.csv")